#include<stdio.h>
#include<cutil.h>
#include<stdlib.h>
#include "satrules.cu"
#include "scankernel.cu"
#include "moveformulas.cu"
#include "read.c"

#define NB_CLAUSES 320
#define n 128

void createformulabatch(unsigned int* startformula){
	for(int i=0; i<n*NB_CLAUSES; i++){
		startformula[i]= 0;
	}
	unsigned int* entree = printtab();
	int length = sizeof(entree) / sizeof(int);
	for(int i=0; i<320; i++){
		startformula[i]=entree[i];
	}
} 

void init(int argc, char *argv[]){
	CUT_DEVICE_INIT(argc, argv);
	
	int initial_size = n*NB_CLAUSES*sizeof(int);
	int expanded_size = 3*n*NB_CLAUSES*sizeof(int);
	int invalidformula_size = 3*n*sizeof(int);
	
	unsigned int* formules;
	unsigned int* newformules;
	unsigned int* newformulesreorganized;
	int* invalidformula;
	int* verifiedclauses;
	int* newformulaorder;
	int* newclausesorder;
	int* totallength;
	int* solutionfound;
	int sol=0;
	
	//allocating the memory to store the results on the device
	CUDA_SAFE_CALL( cudaMalloc( (void**) &formules, initial_size));
	CUDA_SAFE_CALL( cudaMalloc( (void **) &newformules, expanded_size));
	CUDA_SAFE_CALL( cudaMalloc( (void **) &newformulesreorganized, expanded_size));
	CUDA_SAFE_CALL( cudaMalloc( (void**) &invalidformula, invalidformula_size));
	CUDA_SAFE_CALL( cudaMalloc( (void**) &verifiedclauses, expanded_size));
	CUDA_SAFE_CALL( cudaMalloc( (void **) &newformulaorder, invalidformula_size));
	CUDA_SAFE_CALL( cudaMalloc( (void **) &newclausesorder, expanded_size));
	CUDA_SAFE_CALL( cudaMalloc( (void **) &totallength, sizeof(int)));
	CUDA_SAFE_CALL( cudaMalloc( (void **) &solutionfound, sizeof(int)));
	
	//initializing the tab:
	int* goodclauses = (int *) malloc(expanded_size);
	unsigned int* formulabatch = (unsigned int*) malloc(1000*initial_size); //enough place to store the remaining formulas
	int* defaultinvalidformula = (int *) malloc(invalidformula_size);
	unsigned int* tmpbatch = (unsigned int*) malloc(expanded_size);
	for(int i=0; i<3*n*NB_CLAUSES; ++i){
		goodclauses[i]=1;
	}
	for(int i=0;i<3*n; ++i){
		defaultinvalidformula[i]=1;
	}
	//initialize the first batch of formulas
	createformulabatch(formulabatch);
	
	//copying defaultvalues
	CUDA_SAFE_CALL( cudaMemcpy(solutionfound, &sol, sizeof(int),cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(formules, formulabatch, initial_size,cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(verifiedclauses, goodclauses, expanded_size ,cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(newformulesreorganized, tmpbatch, expanded_size ,cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(newformules, tmpbatch, expanded_size ,cudaMemcpyHostToDevice) );
	CUDA_SAFE_CALL( cudaMemcpy(invalidformula, defaultinvalidformula, invalidformula_size ,cudaMemcpyHostToDevice));
	
	int curseur = 0;
	int l = 1;
	int batchsize=l;
	int found=0;
	int i=0;
	int formula_size=NB_CLAUSES;
	
	unsigned int* finalformulas = new unsigned int[expanded_size/sizeof(int)];
	int* invalidatedformula = (int *) malloc(invalidformula_size);
	
	//initialize dimensions used un different kernels
	
	
	printf("debut du traitement:\n");
	while(curseur + l > 0){
		//initializing array with the good value
		int memtoupload = NB_CLAUSES*batchsize*3*sizeof(unsigned int);
		CUDA_SAFE_CALL( cudaMemcpy(verifiedclauses, goodclauses, memtoupload ,cudaMemcpyHostToDevice) );
		CUDA_SAFE_CALL( cudaMemcpy(newformules, tmpbatch, memtoupload ,cudaMemcpyHostToDevice) );
		CUDA_SAFE_CALL( cudaMemcpy(newformulesreorganized, newformules, memtoupload ,cudaMemcpyDeviceToDevice) );
		CUDA_SAFE_CALL( cudaMemcpy(invalidformula, defaultinvalidformula, invalidformula_size ,cudaMemcpyHostToDevice));
		
		//------------------------------------
		// applying satrules
		dim3 grid(n/2, 1, 1);
		int num_threads = 192;
		dim3 threads(num_threads, 1, 1);
		int sharedmem_sat = 6*sizeof(unsigned int);
		applyrules<<< grid, threads, 10000 >>>(n,formules, newformules, invalidformula, verifiedclauses, batchsize);
		cudaThreadSynchronize();
		CUT_CHECK_ERROR("erreur lors du kernel SAT");
		
		
		//-------------------------------------
		//finding the order of each formula:
		l = 0;
		dim3 grid2(1,1,1);
		dim3 threads2(3*n,1,1);
		int sharedmem_scan = 2*batchsize*3*sizeof(int);
		scan<<< grid2, threads2, sharedmem_scan >>>(newformulaorder,invalidformula,  batchsize*3, totallength);
		cudaThreadSynchronize();
		CUT_CHECK_ERROR("kernel execution: apres premier scan");
		CUDA_SAFE_CALL( cudaMemcpy(&l,totallength, sizeof(int), cudaMemcpyDeviceToHost) );
		
		
		//--------------------------------------
		// finding the order of each clause
		found =0;
		dim3 grid_scan(batchsize*3,1,1);
		dim3 threads_scan(NB_CLAUSES,1,1);
		int sharedmem = 2 * NB_CLAUSES * sizeof(int);
		scanclauses<<< grid_scan, threads_scan, sharedmem>>>(newclausesorder, verifiedclauses, NB_CLAUSES, solutionfound, batchsize*3, invalidformula);
		cudaThreadSynchronize();
		CUT_CHECK_ERROR("erreur apres second scan");
		CUDA_SAFE_CALL( cudaMemcpy(&found,solutionfound, sizeof(int),cudaMemcpyDeviceToHost) );
		if(i%1000==0) {
			printf("etape %i, %i elements dans le batch.\n",i,l);
		}
		if(found==1){
			printf("solution found !!!");
			break;
		}
		i++;
		//--------------------------------------
		// moving each clauses to its right place
		moveWS<<< grid_scan, threads_scan>>>(newformules, newformulesreorganized, invalidformula, verifiedclauses, newformulaorder,newclausesorder, NB_CLAUSES, batchsize*3);
		cudaThreadSynchronize();
		CUT_CHECK_ERROR("erreur apres la mobilisation");
		
		//--------------------------------------
		//swapping the result with our limit in the buffer
		if(l > n){
			// if the batch is full, we store some formulas
			CUDA_SAFE_CALL( cudaMemcpy(formulabatch+curseur*formula_size, newformulesreorganized+n*formula_size, (l-n)*formula_size*sizeof(int) ,cudaMemcpyDeviceToHost));
			curseur += l - n;
			batchsize=n;
		}
		else if (l < n){
			// if the batch is not empty, we fill it to capacity
			CUDA_SAFE_CALL( cudaMemcpy(newformulesreorganized + l*formula_size, formulabatch  + max(0, curseur - (n - l))*formula_size, min(curseur, n-l)*formula_size*sizeof(int),cudaMemcpyHostToDevice) );
			curseur = max(0, curseur - (n - l));
			batchsize= l +min(curseur, n-l);
		}
		CUDA_SAFE_CALL( cudaMemcpy(formules, newformulesreorganized, n*formula_size*sizeof(int) ,cudaMemcpyDeviceToDevice));
	}
	if(found==0){
		printf("aucune solution trouvee\n");
	}
}

int main(int argc, char *argv[]){
	init(argc,argv);
	CUT_EXIT(argc, argv);
}
